Goal: can machine learning methods help us to associate metabolites with leaf length?
Previously (script 11b2) I filtered out unnamed metabolites. Here I keep them all.
Also I will PC separately for root and leaf.
library(glmnet)
library(relaimpo)
library(tidyverse)
get leaflength data
leaflength <- read_csv("../../plant/output/leaf_lengths_metabolite.csv") %>%
mutate(pot=str_pad(pot, width=3, pad="0"),
sampleID=str_c("wyo", genotype, pot, sep="_")) %>%
select(sampleID, leaf_avg_std)
── Column specification ────────────────────────────────────────────────────────────────────
cols(
pot = col_double(),
soil = col_character(),
genotype = col_character(),
trt = col_character(),
leaf_avg = col_double(),
leaf_avg_std = col_double()
)
leaflength %>% arrange(sampleID)
get and wrangle metabolite data
met_raw <-read_csv("../input/metabolites_set1.csv")
── Column specification ────────────────────────────────────────────────────────────────────
cols(
.default = col_double(),
tissue = col_character(),
soil = col_character(),
genotype = col_character(),
autoclave = col_character(),
time_point = col_character(),
concatenate = col_character()
)
ℹ Use `spec()` for the full column specifications.
met <- met_raw %>%
mutate(pot=str_pad(pot, width = 3, pad = "0")) %>%
mutate(sampleID=str_c("wyo", genotype, pot, sep="_")) %>%
select(sampleID, genotype, tissue, sample_mass = `sample_mass mg`, !submission_number:concatenate) %>%
pivot_longer(!sampleID:sample_mass, names_to = "metabolite", values_to = "met_amount") %>%
#adjust by sample mass
mutate(met_per_mg=met_amount/sample_mass) %>%
#scale and center
group_by(metabolite, genotype, tissue) %>%
mutate(met_per_mg=scale(met_per_mg),
met_amt=scale(met_amount)
) %>%
pivot_wider(id_cols = sampleID,
names_from = c(tissue, metabolite),
values_from = starts_with("met_"),
names_sep = "_")
met
split this into two data frames, one normalized by tissue amount and one not.
met_per_mg <- met %>% select(sampleID, starts_with("met_per_mg")) %>%
as.data.frame() %>% column_to_rownames("sampleID")
met_amt <- met %>% select(sampleID, starts_with("met_amt")) %>%
as.data.frame() %>% column_to_rownames("sampleID")
get leaf data order to match
leaflength <- leaflength[match(met$sampleID, leaflength$sampleID),]
leaflength
met_per_mg.leaf_PCA <- met_per_mg %>%
select(matches("_leaf_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.leaf_PCA)
[1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_per_mg.leaf_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_per_mg.leaf_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, normalized leaf metabolites")
met_per_mg.root_PCA <- met_per_mg %>%
select(matches("_root_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.root_PCA)
[1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_per_mg.root_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_per_mg.root_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, normalized root metabolites")
met_amt.leaf_PCA <- met_amt %>%
select(matches("_leaf_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.leaf_PCA)
[1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_amt.leaf_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_amt.leaf_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, raw leaf metabolites")
met_amt.root_PCA <- met_amt %>%
select(matches("_root_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.root_PCA)
[1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_amt.root_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_amt.root_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, raw root metabolites")
are the PCs normalized?
colMeans(met_amt.leaf_PCA$x) %>% round(3) #yes centered
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30 PC31 PC32 PC33 PC34 PC35 PC36
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
apply(met_amt.leaf_PCA$x, 2, sd) %>% round(2) #not scaled
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15
11.94 8.37 6.74 5.84 5.46 5.32 5.02 4.55 4.38 4.11 3.87 3.80 3.73 3.56 3.44
PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30
3.32 3.27 3.17 3.16 3.08 2.94 2.90 2.86 2.78 2.73 2.61 2.56 2.53 2.40 2.39
PC31 PC32 PC33 PC34 PC35 PC36
2.33 2.26 2.22 2.14 0.00 0.00
combine the leaf and root, and then scale them:
met_per_mg.leaf_PCs <- met_per_mg.leaf_PCA$x
colnames(met_per_mg.leaf_PCs) <- str_c("leaf_", colnames(met_per_mg.leaf_PCs))
met_per_mg.root_PCs <- met_per_mg.root_PCA$x
colnames(met_per_mg.root_PCs) <- str_c("root_", colnames(met_per_mg.root_PCs))
met_per_mg.PCs <- cbind(met_per_mg.leaf_PCs, met_per_mg.root_PCs) %>%
scale()
met_amt.leaf_PCs <- met_amt.leaf_PCA$x
colnames(met_amt.leaf_PCs) <- str_c("leaf_", colnames(met_amt.leaf_PCs))
met_amt.root_PCs <- met_amt.root_PCA$x
colnames(met_amt.root_PCs) <- str_c("root_", colnames(met_amt.root_PCs))
met_amt.PCs <- cbind(met_amt.leaf_PCs, met_amt.root_PCs) %>%
scale()
also combine the rotations
met_per_mg.leaf_rotation <- met_per_mg.leaf_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("leaf_", .x)) %>%
rownames_to_column("metabolite")
met_per_mg.root_rotation <- met_per_mg.root_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("root_", .x)) %>%
rownames_to_column("metabolite")
met_per_mg.PC_rotation <- full_join(met_per_mg.leaf_rotation, met_per_mg.root_rotation, by="metabolite")
met_amt.leaf_rotation <- met_amt.leaf_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("leaf_", .x)) %>%
rownames_to_column("metabolite")
met_amt.root_rotation <- met_amt.root_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("root_", .x)) %>%
rownames_to_column("metabolite")
met_amt.PC_rotation <- full_join(met_amt.leaf_rotation, met_amt.root_rotation, by="metabolite")
met_per_mg_fit1LOO <- cv.glmnet(x=met_per_mg.PCs, y=leaflength$leaf_avg_std, nfolds = nrow(met_per_mg.PCs), alpha=1 )
Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(met_per_mg_fit1LOO)
bestlam=met_per_mg_fit1LOO$lambda.1se
NEXT STEP: Do a K-fold CV, repeat many times and average. Might as well do alpha while we are at it. If we are doing alpha, then we need to manually create our own folds list for each run
Fit 101 CVs for each of 11 alphas
set.seed(1245)
folds <- tibble(run=1:101) %>%
mutate(folds=map(run, ~ sample(rep(1:6,6))))
system.time (met_per_mg_multiCV <- expand_grid(run=1:100, alpha=round(seq(0,1,.1),1)) %>%
left_join(folds, by="run") %>%
mutate(fit=map2(folds, alpha, ~ cv.glmnet(x=met_per_mg.PCs, y=leaflength$leaf_avg_std, foldid = .x, alpha=.y
)))
#, lambda=exp(seq(-5,0,length.out = 50)) )))
) #100 seconds
user system elapsed
104.115 5.136 119.207
head(met_per_mg_multiCV)
for each fit, pull out the mean cv error, lambda, min lambda, and 1se lambda
met_per_mg_multiCV <- met_per_mg_multiCV %>%
mutate(cvm=map(fit, magrittr::extract("cvm")),
lambda=map(fit, magrittr::extract("lambda")),
lambda.min=map_dbl(fit, magrittr::extract("lambda.min" )),
lambda.1se=map_dbl(fit, magrittr::extract("lambda.1se")),
nzero=map(fit, magrittr::extract("nzero"))
)
head(met_per_mg_multiCV)
now calculate the mean and sem of cvm and min,1se labmdas. These need to be done separately because of the way the grouping works
met_per_mg_summary_cvm <- met_per_mg_multiCV %>% dplyr::select(-fit, -folds) %>%
unnest(c(cvm, lambda)) %>%
group_by(alpha, lambda) %>%
summarize(meancvm=mean(cvm), sem=sd(cvm)/sqrt(n()), high=meancvm+sem, low=meancvm-sem)
`summarise()` regrouping output by 'alpha' (override with `.groups` argument)
met_per_mg_summary_cvm
met_per_mg_summary_lambda <- met_per_mg_multiCV %>% dplyr::select(-fit, -folds, -cvm) %>%
group_by(alpha) %>%
summarize(
lambda.min.sd=sd(lambda.min),
lambda.min.mean=mean(lambda.min),
#lambda.min.med=median(lambda.min),
lambda.min.high=lambda.min.mean+lambda.min.sd,
#lambda.min.low=lambda.min.mean-lambda.min.sem,
#lambda.1se.sem=sd(lambda.1se)/sqrt(n()),
lambda.1se.mean=mean(lambda.1se),
#lambda.1se.med=median(lambda.1se),
#lambda.1se.high=lambda.1se+lambda.1se.sem,
#lambda.1se.low=lambda.1se-lambda.1se.sem,
nzero=nzero[1],
lambda=lambda[1]
)
`summarise()` ungrouping output (override with `.groups` argument)
met_per_mg_summary_lambda
plot it
met_per_mg_summary_cvm %>%
#filter(alpha!=0) %>% # worse than everything else and throwing the plots off
ggplot(aes(x=log(lambda), y= meancvm, ymin=low, ymax=high)) +
geom_ribbon(alpha=.25) +
geom_line(aes(color=as.character(alpha))) +
facet_wrap(~ as.character(alpha)) +
coord_cartesian(xlim=(c(-5,0))) +
geom_vline(aes(xintercept=log(lambda.min.mean)), alpha=.5, data=met_per_mg_summary_lambda) +
geom_vline(aes(xintercept=log(lambda.min.high)), alpha=.5, data=met_per_mg_summary_lambda, color="blue")
So overall these look more reasonable than the LOO plot.
Make a plot of MSE at minimum lambda for each alpha
met_per_mg_summary_cvm %>%
group_by(alpha) %>%
filter(rank(meancvm, ties.method = "first")==1) %>%
ggplot(aes(x=alpha,y=meancvm,ymin=low,ymax=high)) +
geom_ribbon(color=NA, fill="gray80") +
geom_line() +
geom_point()
not a particular large difference there, aside from 0.1 and even then, not too much better.
Plot the number of nzero coefficients
met_per_mg_summary_lambda %>%
unnest(c(lambda, nzero)) %>%
group_by(alpha) %>%
filter(abs(lambda.min.mean-lambda)==min(abs(lambda.min.mean-lambda)) ) %>%
ungroup() %>%
ggplot(aes(x=as.character(alpha), y=nzero)) +
geom_point() +
ggtitle("Number of non-zero coefficents at minimum lambda") +
ylim(0,36)
OK let’s do repeated test train starting from these CV lambdas
multi_tt <- function(lambda, alpha, n=10000, sample_size=36, train_size=30, x, y=leaflength$leaf_avg_std) {
print(lambda)
print(alpha)
tt <-
tibble(run=1:n) %>%
mutate(train=map(run, ~ sample(1:sample_size, train_size))) %>%
mutate(fit=map(train, ~ glmnet(x=x[.,], y=y[.], lambda = lambda, alpha = alpha ))) %>%
mutate(pred=map2(fit, train, ~ predict(.x, newx = x[-.y,]))) %>%
mutate(cor=map2_dbl(pred, train, ~ cor(.x, y[-.y]) )) %>%
mutate(MSE=map2_dbl(pred, train, ~ mean((y[-.y] - .x)^2))) %>%
summarize(
num_na=sum(is.na(cor)),
num_lt_0=sum(cor<=0, na.rm=TRUE),
avg_cor=mean(cor, na.rm=TRUE),
avg_MSE=mean(MSE))
tt
}
per_mg_fit_test_train <- met_per_mg_summary_lambda %>%
select(alpha, lambda.min.mean)
per_mg_fit_test_train <- met_per_mg_multiCV %>%
filter(run==1) %>%
select(alpha, fit) %>%
right_join(per_mg_fit_test_train)
Joining, by = "alpha"
per_mg_fit_test_train <- per_mg_fit_test_train %>%
mutate(pred_full=map2(fit, lambda.min.mean, ~ predict(.x, s=.y, newx=met_per_mg.PCs)),
full_R=map_dbl(pred_full, ~ cor(.x, leaflength$leaf_avg_std)),
full_MSE=map_dbl(pred_full, ~ mean((leaflength$leaf_avg_std-.x)^2))) %>%
mutate(tt=map2(lambda.min.mean, alpha, ~ multi_tt(lambda=.x, alpha=.y, x=met_per_mg.PCs)))
[1] 13.36988
[1] 0
[1] 0.7666849
[1] 0.1
[1] 0.8732274
[1] 0.2
[1] 0.6950232
[1] 0.3
[1] 0.6122284
[1] 0.4
[1] 0.5404081
[1] 0.5
[1] 0.4719584
[1] 0.6
[1] 0.4194147
[1] 0.7
[1] 0.3761253
[1] 0.8
[1] 0.3410749
[1] 0.9
[1] 0.3097519
[1] 1
(per_mg_fit_test_train <- per_mg_fit_test_train %>% unnest(tt))
per_mg_fit_test_train %>%
ggplot(aes(x=alpha)) +
geom_line(aes(y=avg_cor), color="red") +
geom_point(aes(y=avg_cor), color="red") +
geom_line(aes(y=avg_MSE), color="blue") +
geom_point(aes(y=avg_MSE), color="blue")
alpha of 0.8 to 1.0 are very similar and are the best here.
alpha_per_mg <- .8
best_per_mg <- per_mg_fit_test_train %>% filter(alpha == alpha_per_mg)
best_per_mg_fit <- best_per_mg$fit[[1]]
best_per_mg_lambda <- best_per_mg$lambda.min.mean
per_mg_coef.tb <- coef(best_per_mg_fit, s=best_per_mg_lambda) %>%
as.matrix() %>% as.data.frame() %>%
rownames_to_column(var="PC") %>%
rename(beta=`1`)
per_mg_coef.tb %>% filter(beta!=0) %>% arrange(beta)
NA
pred and obs
plot(leaflength$leaf_avg_std, best_per_mg$pred_full[[1]])
cor.test(leaflength$leaf_avg_std, best_per_mg$pred_full[[1]]) #.57
Pearson's product-moment correlation
data: leaflength$leaf_avg_std and best_per_mg$pred_full[[1]]
t = 4.1276, df = 34, p-value = 0.0002243
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.3076242 0.7617163
sample estimates:
cor
0.5777675
best_per_mg$full_MSE
[1] 0.7717674
per_mg_vars <- per_mg_coef.tb %>%
filter(beta !=0, PC!="(Intercept)") %>%
pull(PC) %>% c("leaf_avg_std", .)
per_mg_relimp <- leaflength %>% select(leaf_avg_std) %>% cbind(met_per_mg.PCs) %>% as.data.frame() %>% dplyr::select(all_of(per_mg_vars)) %>%
calc.relimp()
per_mg_coef.tb <- per_mg_relimp@lmg %>% as.matrix() %>% as.data.frame() %>%
rownames_to_column("PC") %>%
rename(PropVar_met_per_mg=V1) %>%
full_join(per_mg_coef.tb) %>%
arrange(desc(PropVar_met_per_mg))
Joining, by = "PC"
per_mg_coef.tb
NA
Checkout the rotations.
met_per_mg_rotation_out <- met_per_mg.PC_rotation %>%
pivot_longer(-metabolite, names_to="PC", values_to="loading") %>%
filter(PC %in% filter(per_mg_coef.tb, beta!=0)$PC ) %>%
group_by(PC) %>%
filter(!str_detect(metabolite,".*(leaf|root)_[0-9]*$")) %>%
filter(abs(loading) >= 0.05) %>%
left_join(per_mg_coef.tb, by="PC") %>%
arrange(desc(abs(beta)), desc(abs(loading))) %>%
mutate(organ=ifelse(str_detect(metabolite, "_leaf_"), "leaf", "root"),
transformation="normalized",
metabolite=str_remove(metabolite, "met_per_mg_(root|leaf)_"),
metabolite_effect_on_leaf=ifelse(beta*loading>0, "increase", "decrease"))
met_per_mg_rotation_out %>% write_csv("../output/Leaf_associated_metaboiltes_normalized.csv")
Fit 101 CVs for each of 11 alphas
set.seed(1245)
folds <- tibble(run=1:101) %>%
mutate(folds=map(run, ~ sample(rep(1:6,6))))
system.time (met_amt_multiCV <- expand_grid(run=1:100, alpha=round(seq(0,1,.1),1)) %>%
left_join(folds, by="run") %>%
mutate(fit=map2(folds, alpha, ~ cv.glmnet(x=met_amt.PCs, y=leaflength$leaf_avg_std, foldid = .x, alpha=.y
)))
#, lambda=exp(seq(-5,0,length.out = 50)) )))
) #100 seconds
user system elapsed
124.177 6.692 149.773
head(met_amt_multiCV)
for each fit, pull out the mean cv error, lambda, min lambda, and 1se lambda
met_amt_multiCV <- met_amt_multiCV %>%
mutate(cvm=map(fit, magrittr::extract("cvm")),
lambda=map(fit, magrittr::extract("lambda")),
lambda.min=map_dbl(fit, magrittr::extract("lambda.min" )),
lambda.1se=map_dbl(fit, magrittr::extract("lambda.1se")),
nzero=map(fit, magrittr::extract("nzero"))
)
head(met_amt_multiCV)
now calculate the mean and sem of cvm and min,1se labmdas. These need to be done separately because of the way the grouping works
met_amt_summary_cvm <- met_amt_multiCV %>% dplyr::select(-fit, -folds) %>%
unnest(c(cvm, lambda)) %>%
group_by(alpha, lambda) %>%
summarize(meancvm=mean(cvm), sem=sd(cvm)/sqrt(n()), high=meancvm+sem, low=meancvm-sem)
`summarise()` regrouping output by 'alpha' (override with `.groups` argument)
met_amt_summary_cvm
met_amt_summary_lambda <- met_amt_multiCV %>% dplyr::select(-fit, -folds, -cvm) %>%
group_by(alpha) %>%
summarize(
lambda.min.sd=sd(lambda.min),
lambda.min.mean=mean(lambda.min),
#lambda.min.med=median(lambda.min),
lambda.min.high=lambda.min.mean+lambda.min.sd,
#lambda.min.low=lambda.min.mean-lambda.min.sem,
#lambda.1se.sem=sd(lambda.1se)/sqrt(n()),
lambda.1se.mean=mean(lambda.1se),
#lambda.1se.med=median(lambda.1se),
#lambda.1se.high=lambda.1se+lambda.1se.sem,
#lambda.1se.low=lambda.1se-lambda.1se.sem,
nzero=nzero[1],
lambda=lambda[1]
)
`summarise()` ungrouping output (override with `.groups` argument)
met_amt_summary_lambda
plot it
met_amt_summary_cvm %>%
#filter(alpha!=0) %>% # worse than everything else and throwing the plots off
ggplot(aes(x=log(lambda), y= meancvm, ymin=low, ymax=high)) +
geom_ribbon(alpha=.25) +
geom_line(aes(color=as.character(alpha))) +
facet_wrap(~ as.character(alpha)) +
coord_cartesian(xlim=(c(-5,0))) +
geom_vline(aes(xintercept=log(lambda.min.mean)), alpha=.5, data=met_amt_summary_lambda) +
geom_vline(aes(xintercept=log(lambda.min.high)), alpha=.5, data=met_amt_summary_lambda, color="blue")
Make a plot of MSE at minimum lambda for each alpha
met_amt_summary_cvm %>%
group_by(alpha) %>%
filter(rank(meancvm, ties.method = "first")==1) %>%
ggplot(aes(x=alpha,y=meancvm,ymin=low,ymax=high)) +
geom_ribbon(color=NA, fill="gray80") +
geom_line() +
geom_point()
not a particular large difference here after 0.2
Plot the number of nzero coefficients
met_amt_summary_lambda %>%
unnest(c(lambda, nzero)) %>%
group_by(alpha) %>%
filter(abs(lambda.min.mean-lambda)==min(abs(lambda.min.mean-lambda)) ) %>%
ungroup() %>%
ggplot(aes(x=as.character(alpha), y=nzero)) +
geom_point() +
ggtitle("Number of non-zero coefficents at minimum lambda") +
ylim(0,36)
OK let’s do repeated test train starting from these CV lambdas
multi_tt <- function(lambda, alpha, n=10000, sample_size=36, train_size=30, x, y=leaflength$leaf_avg_std) {
print(lambda)
print(alpha)
tt <-
tibble(run=1:n) %>%
mutate(train=map(run, ~ sample(1:sample_size, train_size))) %>%
mutate(fit=map(train, ~ glmnet(x=x[.,], y=y[.], lambda = lambda, alpha = alpha ))) %>%
mutate(pred=map2(fit, train, ~ predict(.x, newx = x[-.y,]))) %>%
mutate(cor=map2_dbl(pred, train, ~ cor(.x, y[-.y]) )) %>%
mutate(MSE=map2_dbl(pred, train, ~ mean((y[-.y] - .x)^2))) %>%
summarize(
num_na=sum(is.na(cor)),
num_lt_0=sum(cor<=0, na.rm=TRUE),
avg_cor=mean(cor, na.rm=TRUE),
avg_MSE=mean(MSE))
tt
}
amt_fit_test_train <- met_amt_summary_lambda %>%
select(alpha, lambda.min.mean)
amt_fit_test_train <- met_amt_multiCV %>%
filter(run==1) %>%
select(alpha, fit) %>%
right_join(amt_fit_test_train)
Joining, by = "alpha"
amt_fit_test_train <- amt_fit_test_train %>%
mutate(pred_full=map2(fit, lambda.min.mean, ~ predict(.x, s=.y, newx=met_amt.PCs)),
full_R=map_dbl(pred_full, ~ cor(.x, leaflength$leaf_avg_std)),
full_MSE=map_dbl(pred_full, ~ mean((leaflength$leaf_avg_std-.x)^2))) %>%
mutate(tt=map2(lambda.min.mean, alpha, ~ multi_tt(lambda=.x, alpha=.y, x=met_amt.PCs)))
[1] 121.4399
[1] 0
[1] 0.9724185
[1] 0.1
[1] 0.6990522
[1] 0.2
[1] 0.586571
[1] 0.3
[1] 0.5167138
[1] 0.4
[1] 0.444683
[1] 0.5
[1] 0.3928743
[1] 0.6
[1] 0.3464051
[1] 0.7
[1] 0.3173148
[1] 0.8
[1] 0.2852059
[1] 0.9
[1] 0.2623291
[1] 1
(amt_fit_test_train <- amt_fit_test_train %>% unnest(tt))
amt_fit_test_train %>%
ggplot(aes(x=alpha)) +
geom_line(aes(y=avg_cor), color="red") +
geom_point(aes(y=avg_cor), color="red") +
geom_line(aes(y=avg_MSE), color="blue") +
geom_point(aes(y=avg_MSE), color="blue")
alpha of 0.8 to 1.0 are very similar and are the best here.
alpha_amt <- .8
best_amt <- amt_fit_test_train %>% filter(alpha == alpha_amt)
best_amt_fit <- best_amt$fit[[1]]
best_amt_lambda <- best_amt$lambda.min.mean
amt_coef.tb <- coef(best_amt_fit, s=best_amt_lambda) %>%
as.matrix() %>% as.data.frame() %>%
rownames_to_column(var="PC") %>%
rename(beta=`1`)
amt_coef.tb %>% filter(beta!=0) %>% arrange(beta)
NA
pred and obs
plot(leaflength$leaf_avg_std, best_amt$pred_full[[1]])
cor.test(leaflength$leaf_avg_std, best_amt$pred_full[[1]]) #.736
Pearson's product-moment correlation
data: leaflength$leaf_avg_std and best_amt$pred_full[[1]]
t = 6.3375, df = 34, p-value = 3.15e-07
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5372665 0.8571965
sample estimates:
cor
0.7359065
best_amt$full_MSE
[1] 0.6064862
amt_vars <- amt_coef.tb %>%
filter(beta !=0, PC!="(Intercept)") %>%
pull(PC) %>% c("leaf_avg_std", .)
amt_relimp <- leaflength %>% select(leaf_avg_std) %>% cbind(met_amt.PCs) %>% as.data.frame() %>% dplyr::select(all_of(amt_vars)) %>%
calc.relimp()
amt_coef.tb <- amt_relimp@lmg %>% as.matrix() %>% as.data.frame() %>%
rownames_to_column("PC") %>%
rename(PropVar_met_amt=V1) %>%
full_join(amt_coef.tb) %>%
arrange(desc(PropVar_met_amt))
Joining, by = "PC"
amt_coef.tb
NA
Checkout the rotations.
met_amt_rotation_out <- met_amt.PC_rotation %>%
pivot_longer(-metabolite, names_to="PC", values_to="loading") %>%
filter(PC %in% filter(amt_coef.tb, beta!=0)$PC ) %>%
group_by(PC) %>%
filter(!str_detect(metabolite,".*(leaf|root)_[0-9]*$")) %>%
filter(abs(loading) >= 0.05) %>%
left_join(amt_coef.tb, by="PC") %>%
arrange(desc(abs(beta)), desc(abs(loading))) %>%
mutate(organ=ifelse(str_detect(metabolite, "_leaf_"), "leaf", "root"),
transformation="normalized",
metabolite=str_remove(metabolite, "met_amt_(root|leaf)_"),
metabolite_effect_on_leaf=ifelse(beta*loading>0, "increase", "decrease"))
met_amt_rotation_out %>% write_csv("../output/Leaf_associated_metaboiltes_raw.csv")